Generate Passport Index datasets


In [1]:
from bs4 import BeautifulSoup
import requests
import js2py
import pandas as pd
from itertools import permutations

Read PassportIndex page

Use Beautiful Soup to extract the JavaScript that defines the arrays holding the visa-free, visa-on-arrival, and eTA (electronic travel authorisation) data.


In [2]:
# Download the Passport Index comparison page.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook here on an HTTP error instead of letting
# a later cell try to parse an error page.
response = requests.get('https://www.passportindex.org/comparebyPassport.php', timeout=30)
response.raise_for_status()
source = response.content.decode('utf-8')

In [3]:
# Parse the downloaded page. Naming the parser keeps the result independent of
# whichever optional parsers happen to be installed.
soup = BeautifulSoup(source, 'html.parser')

# JavaScript appended after the page's own script. It returns four arrays of
# [passport, value] pairs, all keyed and ordered by Object.keys(com_c_vf):
#   [0] com_c_vf, [1] com_c_voa, [2] com_c_eta, [3] so_vf
js_epilogue = (
    '; return ['
    'Object.keys(com_c_vf).map(function(z){return [z, com_c_vf[z]]}),'
    'Object.keys(com_c_vf).map(function(z){return [z, com_c_voa[z]]}),'
    'Object.keys(com_c_vf).map(function(z){return [z, com_c_eta[z]]}),'
    'Object.keys(com_c_vf).map(function(z){return [z, so_vf[z]]})'
    ']};a()'
)

# Collect every <script> that mentions com_c_vf and fail with a clear message
# if the page layout changed and none is found.
matching_scripts = [s.text for s in soup.find_all('script') if 'com_c_vf' in s.text]
if not matching_scripts:
    raise RuntimeError("No <script> block containing 'com_c_vf' was found in the downloaded page")

# When several scripts match, the last one is used.
script = 'function a() {' + matching_scripts[-1] + js_epilogue

# NOTE(security): this executes JavaScript fetched from a remote site; only run
# it against a source you trust.
data = js2py.eval_js(script)

In [4]:
# Lookup from the per-destination codes found in so_vf to the value written to
# the 'Code' column for visa-free entries.
# The numeric values look like day counts (presumably the allowed length of
# stay) -- TODO confirm against the site.
# NOTE(review): codes '10' and '11' have no entry here; a lookup of either
# would raise KeyError.
so = {
    '0': 'VF',
    '1': 7,
    '2': 14,
    '3': 90,
    '4': 28,
    '5': 30,
    '6': 180,
    '7': 360,
    '8': 31,
    '9': '-',
    '12': 60,
    '13': 15,
    '14': 120,
    '15': 240,
    '16': 45,  # key was previously misspelled as '16:' and could never match
    '17': 21,
    '18': 42,
}

Begin data processing


In [5]:
# Country lookup table, indexed by ISO2 code.
# Later cells read its 'ISO3' and 'Country' columns.
url = 'https://gist.githubusercontent.com/ilyankou/b2580c632bdea4af2309dcaa69860013/raw/420fb417bcd17d833156efdf64ce8a1c3ceb2691/country-codes'
codes = (
    pd.read_csv(url, dtype=str)
    # Missing cells become the string 'NA'. Presumably this restores Namibia's
    # ISO2 code, which read_csv treats as a missing value -- TODO confirm.
    .fillna('NA')
    .set_index('ISO2')
)

def fix_iso2(x):
    """Translate the two non-standard country codes to the ISO2 codes used here.

    'UK' is returned as 'GB' and 'RK' as 'XK'; every other value is returned
    unchanged.
    """
    corrections = {'UK': 'GB', 'RK': 'XK'}
    return corrections.get(x, x)

In [6]:
# One row per ordered (Passport, Destination) pair of distinct country codes.
multiindex = pd.MultiIndex.from_tuples(
    list(permutations(codes.index, 2)), names=['Passport', 'Destination']
)

# By default every pair requires a visa ('VR'); the loops below overwrite the
# pairs for which the scraped data lists an easier entry regime.
tidy_iso2 = pd.DataFrame(index=multiindex)
tidy_iso2['Code'] = 'VR'

# Layout of `data`, as built by the JavaScript wrapper:
#   data[0] -> visa free        (Code taken from `so` via the so_vf codes)
#   data[1] -> visa on arrival  (Code 'VOA')
#   data[2] -> eTA              (Code 'ETA')
#   data[3] -> so_vf codes, in the same passport order as data[0]
# Later categories overwrite earlier ones when a pair appears in more than one.
entry_labels = {1: 'VOA', 2: 'ETA'}

for i in range(3):
    for j in range(len(data[i])):
        passport = fix_iso2(data[i][j][0])  # correct UK and Kosovo codes
        countries = data[i][j][1].split(',')

        # For visa free, the Code is looked up from the so_vf code at the same
        # position as the destination.
        if i == 0:
            vf2days = data[3][j][1].split(',')

        for k, country in enumerate(countries):
            if country == '':
                continue

            country = fix_iso2(country)  # correct UK and Kosovo codes

            code = so[vf2days[k]] if i == 0 else entry_labels[i]
            tidy_iso2.loc[(passport, country), 'Code'] = code

Save ISO-2 files, both matrix and tidy


In [7]:
# Tidy (long) format: one row per (Passport, Destination) pair.
tidy_iso2.to_csv('passport-index-tidy-iso2.csv')

# Matrix (wide) format: passports as rows, destinations as columns.
# Cells with no entry are written as -1.
matrix_iso2 = tidy_iso2.reset_index().pivot(
    index='Passport', columns='Destination', values='Code'
)
matrix_iso2.fillna(-1).to_csv('passport-index-matrix-iso2.csv')

Translate ISO-2 into ISO-3 and save both datasets


In [8]:
def iso2_to_iso3(iso2):
    """Return the 'ISO3' value stored in ``codes`` for the given ISO2 code."""
    return codes.loc[iso2]['ISO3']


# Work on a flattened copy so tidy_iso2 itself is left untouched.
tidy_iso3 = tidy_iso2.copy(deep=True).reset_index()
for column in ('Passport', 'Destination'):
    tidy_iso3[column] = tidy_iso3[column].apply(iso2_to_iso3)

# Tidy (long) format.
tidy_iso3.to_csv('passport-index-tidy-iso3.csv', index=False)

# Matrix (wide) format; cells with no entry are written as -1.
# The extra reset_index() only adds an 'index' column, which pivot ignores.
matrix_iso3 = tidy_iso3.reset_index().pivot(
    index='Passport', columns='Destination', values='Code'
)
matrix_iso3.fillna(-1).to_csv('passport-index-matrix-iso3.csv')

Translate ISO-2 into country names and save both datasets


In [9]:
def iso2_to_country(iso2):
    """Return the 'Country' value stored in ``codes`` for the given ISO2 code."""
    return codes.loc[iso2]['Country']


# Work on a flattened copy so tidy_iso2 itself is left untouched.
tidy_names = tidy_iso2.copy(deep=True).reset_index()
for column in ('Passport', 'Destination'):
    tidy_names[column] = tidy_names[column].apply(iso2_to_country)

# Tidy (long) format.
tidy_names.to_csv('passport-index-tidy.csv', index=False)

# Matrix (wide) format; cells with no entry are written as -1.
# The extra reset_index() only adds an 'index' column, which pivot ignores.
matrix_names = tidy_names.reset_index().pivot(
    index='Passport', columns='Destination', values='Code'
)
matrix_names.fillna(-1).to_csv('passport-index-matrix.csv')

In [ ]:


In [ ]: